Introduction

In this report, we extract information about published JOSS papers and generate graphics as well as a summary table that can be downloaded and used for further analyses.

Load required R packages

## Attach all packages used in this report; startup messages are
## suppressed to keep the rendered output clean. Load order matters for
## function masking (e.g. dplyr masks stats::filter), so keep it as is.
suppressPackageStartupMessages({
  library(tibble)     # tibble data frames
  library(rcrossref)  # Crossref API (paper metadata, citation counts)
  library(dplyr)      # data manipulation verbs
  library(tidyr)      # data tidying
  library(ggplot2)    # static plots
  library(lubridate)  # date handling (year(), month(), floor_date())
  library(gh)         # GitHub API client
  library(purrr)      # functional mapping helpers
  library(jsonlite)   # JSON parsing (Whedon API)
  library(DT)         # interactive tables
  library(plotly)     # interactive plots
})

Collect information about papers

Pull down papers and citation info from Crossref

We get the information about published JOSS papers from Crossref, using the rcrossref R package. This package is also used to extract citation counts.

## Fetch JOSS papers from Crossref
## At most 1000 papers can be pulled down at a time
page_size <- 1000
papers <- rcrossref::cr_works(filter = c(issn = "2475-9066"),
                              limit = page_size)$data
## Warning: `tbl_df()` is deprecated as of dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Keep requesting further pages until an incomplete page comes back
n_batches <- 1
while (nrow(papers) == n_batches * page_size) {
  next_batch <- rcrossref::cr_works(filter = c(issn = "2475-9066"),
                                    limit = page_size,
                                    offset = n_batches * page_size)$data
  papers <- dplyr::bind_rows(papers, next_batch)
  n_batches <- n_batches + 1
}
## Retain only journal articles
papers <- papers %>%
  dplyr::filter(type == "journal-article")

## A few papers don't have DOIs - generate them from the URL by stripping
## the resolver prefix. fixed = TRUE treats the pattern literally, so the
## dots cannot act as regex wildcards.
noaltid <- which(is.na(papers$alternative.id))
papers$alternative.id[noaltid] <- gsub("http://dx.doi.org/", "",
                                       papers$url[noaltid], fixed = TRUE)

## Get citation info from Crossref and merge with paper details
cit <- rcrossref::cr_citation_count(doi = papers$alternative.id)
papers <- papers %>% dplyr::left_join(
  cit %>% dplyr::rename(citation_count = count), 
  by = c("alternative.id" = "doi")
)

## Remove one duplicated paper
papers <- papers %>% dplyr::filter(alternative.id != "10.21105/joss.00688")

Pull down info from Whedon API

For each published paper, we use the Whedon API to get information about pre-review and review issue numbers, corresponding software repository etc.

## Page through the Whedon published-papers API until an empty page is
## returned. simplifyDataFrame = FALSE keeps each paper as a nested list
## so that per-paper fields can be extracted individually below.
## (A repeat loop avoids duplicating the fromJSON call before the loop.)
whedon <- list()
p <- 1
repeat {
  a <- jsonlite::fromJSON(
    url(paste0("https://joss.theoj.org/papers/published.json?page=", p)),
    simplifyDataFrame = FALSE
  )
  if (length(a) == 0) break
  whedon <- c(whedon, a)
  p <- p + 1
}

## Flatten the per-paper Whedon records into one data.frame.
## meta_review_issue_id may be absent (NULL) for old submissions, so it
## is mapped to NA with a scalar if/else (ifelse() on length-one input
## is slower and strips attributes).
whedon <- do.call(dplyr::bind_rows, lapply(whedon, function(w) {
  data.frame(api_title = w$title, 
             api_state = w$state,
             repo_url = w$repository_url,
             review_issue_id = w$review_issue_id,
             doi = w$doi,
             prereview_issue_id = if (!is.null(w$meta_review_issue_id)) {
               w$meta_review_issue_id
             } else {
               NA_integer_
             },
             languages = paste(w$metadata$paper$languages, collapse = ","),
             archive_doi = w$metadata$paper$archive_doi)
}))

## Join Whedon info onto the Crossref records via the DOI
papers <- papers %>% dplyr::left_join(whedon, by = c("alternative.id" = "doi"))

Combine with info from GitHub issues

From each pre-review and review issue, we extract information about review times and assigned labels.

## Pull down info on all issues in the joss-reviews repository
issues <- gh("/repos/openjournals/joss-reviews/issues", 
             .limit = 5000, state = "all")
## From each issue, extract the fields used downstream. closed_at is
## NULL for open issues, so map it to NA with a scalar if/else rather
## than ifelse(). Workflow labels are removed from the label string.
iss <- do.call(dplyr::bind_rows, lapply(issues, function(i) {
  data.frame(title = i$title, 
             number = i$number,
             state = i$state,
             opened = i$created_at,
             closed = if (!is.null(i$closed_at)) i$closed_at else NA_character_,
             ncomments = i$comments,
             labels = paste(setdiff(
               vapply(i$labels, getElement, 
                      name = "name", character(1L)),
               c("review", "pre-review", "query-scope", "paused")),
               collapse = ","))
}))

## Split into REVIEW, PRE-REVIEW, and other issues (the latter category
## is discarded). Multiple arguments to filter() are combined with AND.
issother <- iss %>%
  dplyr::filter(!grepl("\\[PRE REVIEW\\]", title),
                !grepl("\\[REVIEW\\]", title))
dim(issother)
## [1] 26  7
head(issother)
##                                                       title number  state
## 1 @torressa @poulson I only found a couple of small issues:   2082 closed
## 2                         Request to regenerate final proof   2045 closed
## 3                     issues running example program Karate   2015 closed
## 4                                          @whedon commands   1898 closed
## 5                                                   @whedon   1897 closed
## 6                                    Updated issue template   1510 closed
##                 opened               closed ncomments labels
## 1 2020-02-07T09:51:50Z 2020-02-07T09:52:09Z         2       
## 2 2020-01-28T14:44:07Z 2020-01-28T14:45:26Z         2       
## 3 2020-01-15T13:25:37Z 2020-01-15T15:05:18Z         3       
## 4 2019-11-17T09:44:23Z 2019-11-17T10:26:41Z         4       
## 5 2019-11-17T09:43:49Z 2019-11-17T10:26:30Z         4       
## 6 2019-06-18T19:13:46Z 2019-06-18T19:43:30Z         0
## For REVIEW issues, generate the DOI of the paper from the issue number.
## Return the zero-padding needed to make the issue number 5 digits wide
## (e.g. "123" -> "00"). max(0L, ...) guards against issue numbers with
## 5 or more digits, for which rep(0, negative) would have errored.
getnbrzeros <- function(s) {
  strrep("0", max(0L, 5L - nchar(s)))
}
## Build one row per REVIEW issue, derive the paper DOI from the
## zero-padded issue number, and prefix all other columns with review_
issrev <- iss %>%
  dplyr::filter(grepl("\\[REVIEW\\]", title)) %>%
  dplyr::mutate(
    nbrzeros = purrr::map_chr(number, getnbrzeros),
    alternative.id = paste0("10.21105/joss.", nbrzeros, number),
    title = gsub("\\[REVIEW\\]: ", "", title)
  ) %>%
  dplyr::select(-nbrzeros) %>%
  dplyr::rename_with(~ paste0("review_", .x), .cols = -alternative.id)

## For PRE-REVIEW issues, add information about the corresponding REVIEW 
## issue number
isspre <- iss %>% dplyr::filter(grepl("\\[PRE REVIEW\\]", title)) %>%
  dplyr::filter(!grepl("withdrawn", labels)) %>%
  dplyr::filter(!grepl("rejected", labels))
## Some titles have multiple pre-review issues. In these cases, keep the latest
isspre <- isspre %>% dplyr::arrange(desc(number)) %>% 
  dplyr::filter(!duplicated(title)) %>% 
  dplyr::mutate(title = gsub("\\[PRE REVIEW\\]: ", "", title)) %>%
  dplyr::rename_all(~ paste0("prerev_", .))

## Attach review and pre-review issue info to each paper, convert the
## issue timestamps to Dates, and derive review-duration columns.
issue_date_cols <- c("prerev_opened", "prerev_closed",
                     "review_opened", "review_closed")
papers <- papers %>%
  dplyr::left_join(issrev, by = "alternative.id") %>%
  dplyr::left_join(isspre, by = c("prereview_issue_id" = "prerev_number")) %>%
  dplyr::mutate(dplyr::across(dplyr::all_of(issue_date_cols), as.Date)) %>%
  dplyr::mutate(days_in_pre = prerev_closed - prerev_opened,
                days_in_rev = review_closed - review_opened,
                to_review = !is.na(review_opened))

Add information from software repositories

## Extract the software repository URLs; only GitHub-hosted repositories
## can be queried via the GitHub API below, so flag which ones they are.
software_urls <- papers$repo_url
is_github <- grepl("github", software_urls)
## Total number of papers with a repo URL
length(is_github)
## [1] 983
## Number of repos hosted on GitHub
sum(is_github)
## [1] 946
## The remaining repos (Bitbucket, GitLab, etc.) are listed but skipped
software_urls[!is_github]
##  [1] "https://bitbucket.org/cloopsy/android/"                    
##  [2] "https://bitbucket.org/manuela_s/hcp/"                      
##  [3] "https://gitlab.com/celliern/scikit-fdiff/"                 
##  [4] "https://doi.org/10.17605/OSF.IO/3DS6A"                     
##  [5] "https://bitbucket.org/glotzer/rowan"                       
##  [6] "https://gitlab.com/moorepants/skijumpdesign"               
##  [7] "https://gitlab.com/toposens/public/ros-packages"           
##  [8] "https://gitlab.inria.fr/azais/treex"                       
##  [9] "https://bitbucket.org/basicsums/basicsums"                 
## [10] "https://savannah.nongnu.org/projects/complot/"             
## [11] "http://mutabit.com/repos.fossil/grafoscopio/"              
## [12] "https://bitbucket.org/cardosan/brightway2-temporalis"      
## [13] "https://bitbucket.org/cdegroot/wediff"                     
## [14] "https://bitbucket.org/meg/cbcbeat"                         
## [15] "https://bitbucket.org/likask/mofem-cephas"                 
## [16] "https://vcs.ynic.york.ac.uk/analysis/sails"                
## [17] "https://bitbucket.org/ocellarisproject/ocellaris"          
## [18] "https://gitlab.com/QComms/cqptoolkit"                      
## [19] "https://gitlab.com/dlr-dw/ontocode"                        
## [20] "https://gitlab.com/eidheim/Simple-Web-Server"              
## [21] "https://bitbucket.org/dghoshal/frieda"                     
## [22] "https://gitlab.com/tesch1/cppduals"                        
## [23] "https://gitlab.com/gdetor/genetic_alg"                     
## [24] "https://bitbucket.org/hammurabicode/hamx"                  
## [25] "https://gitlab.com/datafold-dev/datafold/"                 
## [26] "https://www.idpoisson.fr/fullswof/"                        
## [27] "https://sourceforge.net/p/mcapl/mcapl_code/ci/master/tree/"
## [28] "https://gricad-gitlab.univ-grenoble-alpes.fr/ttk/spam/"    
## [29] "https://c4science.ch/source/tamaas/"                       
## [30] "https://gitlab.inria.fr/miet/miet"                         
## [31] "https://bitbucket.org/mpi4py/mpi4py-fft"                   
## [32] "https://gitlab.com/myqueue/myqueue"                        
## [33] "https://bitbucket.org/cmutel/brightway2"                   
## [34] "https://gitlab.com/cerfacs/batman"                         
## [35] "https://gitlab.com/materials-modeling/wulffpack"           
## [36] "https://bitbucket.org/dolfin-adjoint/pyadjoint"            
## [37] "https://gitlab.com/costrouc/pysrim"
## For each GitHub-hosted repo, query the GitHub API for repository
## metadata and its contributor list. Repos that are inaccessible
## (moved, deleted, private) make gh() error; try(silent = TRUE)
## captures that and the repo is skipped (NULL rows are dropped by
## bind_rows). Contributor counts are NA when the listing fails or is
## empty. inherits() replaces methods::is() and scalar if/else replaces
## ifelse() on length-one values.
df <- do.call(dplyr::bind_rows, lapply(software_urls[is_github], function(u) {
  ## Normalize the URL (drop trailing "/" and ".git") and map it to the
  ## GitHub REST path /repos/<owner>/<repo>
  u0 <- gsub("\\.git$", "", gsub("/$", "", u))
  api_path <- gsub("(https://)?(www.)?github.com/", "/repos/", u0)
  info <- try(gh(api_path), silent = TRUE)
  contribs <- try(gh(paste0(api_path, "/contributors"), .limit = 500),
                  silent = TRUE)
  if (inherits(info, "try-error") || length(info) <= 1) {
    return(NULL)
  }
  if (inherits(contribs, "try-error") || length(contribs) == 0) {
    repo_nbr_contribs <- repo_nbr_contribs_2ormore <- NA_integer_
  } else {
    repo_nbr_contribs <- length(contribs)
    ## logical results are promoted to integer by vapply's template
    repo_nbr_contribs_2ormore <- sum(
      vapply(contribs, function(x) x$contributions >= 2, NA_integer_)
    )
    if (is.na(repo_nbr_contribs_2ormore)) {
      print(contribs)  # debug aid: flag unexpected contributor records
    }
  }
  data.frame(repo_url = u, 
             repo_created = info$created_at,
             repo_updated = info$updated_at,
             repo_pushed = info$pushed_at,
             repo_nbr_stars = info$stargazers_count,
             repo_language = if (!is.null(info$language)) {
               info$language
             } else {
               NA_character_
             },
             repo_license = if (!is.null(info$license)) {
               info$license$key
             } else {
               NA_character_
             },
             repo_nbr_contribs = repo_nbr_contribs,
             repo_nbr_contribs_2ormore = repo_nbr_contribs_2ormore
  )
})) %>%
  dplyr::mutate(repo_created = as.Date(repo_created),
                repo_updated = as.Date(repo_updated),
                repo_pushed = as.Date(repo_pushed)) %>%
  dplyr::distinct()
## Each repo must appear at most once before joining onto papers
stopifnot(length(unique(df$repo_url)) == length(df$repo_url))
papers <- papers %>% dplyr::left_join(df, by = "repo_url")

Clean up a bit

## Convert publication date to Date format
## Add information about the half year (H1, H2) of publication
## Count number of authors
papers <- papers %>%
  dplyr::select(-reference, -license, -link) %>%
  dplyr::mutate(
    published.date = as.Date(published.print),
    halfyear = paste0(year(published.date),
                      ifelse(month(published.date) <= 6, "H1", "H2"))
  ) %>%
  dplyr::mutate(
    ## Fix the factor levels so every half year appears, in order
    halfyear = factor(
      halfyear,
      levels = paste0(rep(sort(unique(year(published.date))), each = 2),
                      c("H1", "H2"))
    ),
    nbr_authors = vapply(author, nrow, NA_integer_)
  )

Number of published papers per month and year

## Number of published papers per month
monthly_counts <- papers %>%
  dplyr::mutate(pubmonth = lubridate::floor_date(published.date, "month")) %>%
  dplyr::group_by(pubmonth) %>%
  dplyr::summarize(npub = n())
ggplot(monthly_counts, aes(x = factor(pubmonth), y = npub)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(x = "", y = "Number of published papers per month") +
  theme(axis.title = element_text(size = 15),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

## Number of published papers per year
yearly_counts <- papers %>%
  dplyr::mutate(pubyear = lubridate::year(published.date)) %>%
  dplyr::group_by(pubyear) %>%
  dplyr::summarize(npub = n())
ggplot(yearly_counts, aes(x = factor(pubyear), y = npub)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(x = "", y = "Number of published papers per year") +
  theme(axis.title = element_text(size = 15),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

Citation distribution

Papers with 20 or more citations are grouped in the “>=20” category.

## Citation distribution; counts >= 20 collapse into a single bin
citation_bins <- papers %>%
  dplyr::mutate(
    citation_count = replace(citation_count,
                             citation_count >= 20, ">=20"),
    citation_count = factor(citation_count,
                            levels = c(0:20, ">=20"))
  ) %>%
  dplyr::group_by(citation_count) %>%
  dplyr::tally()
ggplot(citation_bins, aes(x = citation_count, y = n)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(x = "Crossref citation count", y = "Number of publications")

Most cited papers

The table below sorts the JOSS papers in decreasing order by the number of citations in Crossref.

## Interactive table of papers, most cited first; the URL column is
## rendered as a clickable link (escape = FALSE keeps the HTML).
DT::datatable(
  papers %>%
    dplyr::mutate(url = sprintf("<a href='%s' target='_blank'>%s</a>",
                                url, url)) %>%
    dplyr::arrange(desc(citation_count)) %>%
    dplyr::select(title, url, published.date, citation_count),
  escape = FALSE,
  options = list(scrollX = TRUE)
)

Citation count vs time since publication

## Citation count vs time since publication, rendered interactively
cit_vs_time <- ggplot(papers,
                      aes(x = published.date, y = citation_count,
                          label = title)) +
  geom_point(alpha = 0.5) +
  geom_smooth() +
  theme_bw() +
  scale_y_sqrt() +
  labs(x = "Date of publication", y = "Crossref citation count") +
  theme(axis.title = element_text(size = 15))
plotly::ggplotly(cit_vs_time, tooltip = c("label", "x", "y"))

Power law of citation count within each half year

Here, we plot the citation count for all papers published within each half year, sorted in decreasing order.

## Rank papers by citation count within each half year and plot the
## resulting (rank, count) curve per facet
ranked <- papers %>%
  dplyr::group_by(halfyear) %>%
  dplyr::arrange(desc(citation_count)) %>%
  dplyr::mutate(idx = seq_along(citation_count))
ggplot(ranked, aes(x = idx, y = citation_count)) +
  geom_point(alpha = 0.5) +
  facet_wrap(~ halfyear, scales = "free") +
  theme_bw() +
  labs(x = "Index", y = "Crossref citation count")

Pre-review/review time over time

In these plots we investigate whether the time a submission spends in the pre-review or review stage has changed over time.

## Days spent in pre-review vs when the pre-review was opened
ggplot(papers, aes(x = prerev_opened, y = as.numeric(days_in_pre))) +
  geom_point() +
  geom_smooth() +
  theme_bw() +
  scale_y_sqrt() +
  theme(axis.title = element_text(size = 15)) +
  labs(x = "Date of pre-review opening",
       y = "Number of days in pre-review")

## Days spent in review vs when the review was opened
ggplot(papers, aes(x = review_opened, y = as.numeric(days_in_rev))) +
  geom_point() +
  geom_smooth() +
  theme_bw() +
  scale_y_sqrt() +
  theme(axis.title = element_text(size = 15)) +
  labs(x = "Date of review opening",
       y = "Number of days in review")

Languages

Next, we consider the languages used by the submissions. Note that a given submission can use multiple languages.

## Count, for each language, the number of submissions that use it.
## languages is a comma-separated string per paper.
sspl <- strsplit(papers$languages, ",")
all_languages <- unique(unlist(sspl))
langs <- dplyr::bind_rows(lapply(all_languages, function(lang) {
  data.frame(language = lang,
             nbr_submissions = sum(vapply(sspl, function(v) lang %in% v, 0)))
}))
langs %>% dplyr::arrange(desc(nbr_submissions))
##            language nbr_submissions
## 1               TeX             838
## 2            Python             556
## 3                 R             259
## 4             Shell             251
## 5          Makefile             181
## 6               C++             142
## 7              HTML             122
## 8  Jupyter Notebook             111
## 9                 C              73
## 10              CSS              65
## 11       JavaScript              55
## 12        Batchfile              53
## 13            CMake              51
## 14           Matlab              45
## 15            Julia              31
## 16          Fortran              24
## 17             Ruby              19
## 18             Java              19
## 19       PowerShell              18
## 20             Perl              11
## 21            Rebol               9
## 22               Go               7
## 23             GLSL               7
## 24               M4               6
## 25               C#               6
## 26             Roff               6
## 27                M               6
## 28             Rust               5
## 29           Prolog               4
## 30             Cuda               4
## 31              PHP               4
## 32           Smarty               4
## 33              IDL               3
## 34            Scala               3
## 35            QMake               3
## 36      Mathematica               3
## 37              Awk               2
## 38             NSIS               2
## 39        Smalltalk               2
## 40         Assembly               2
## 41           Groovy               2
## 42             Stan               2
## 43             XSLT               2
## 44         IGOR Pro               2
## 45              Vue               2
## 46      Objective-C               2
## 47            OCaml               2
## 48              Tcl               2
## 49              GAP               2
## 50              Lua               2
## 51             Mako               1
## 52       AGS Script               1
## 53            PLSQL               1
## 54      Common Lisp               1
## 55               eC               1
## 56            Lasso               1
## 57          Gnuplot               1
## 58            Stata               1
## 59              wdl               1
## 60       Emacs Lisp               1
## 61           Kotlin               1
## 62       Inno Setup               1
## 63            Logos               1
## 64             Yacc               1
## 65     CoffeeScript               1
## 66               F#               1
## 67           XQuery               1
## 68                D               1
## 69             Golo               1
## 70           Scheme               1
## 71           Puppet               1
## 72            HyPhy               1
## 73     UnrealScript               1
## 74            ANTLR               1
## 75              Max               1
## 76              QML               1
## 77           JSONiq               1
## 78       TypeScript               1
## 79       FreeMarker               1
## Bar chart of languages, ordered by number of submissions
lang_order <- langs %>%
  dplyr::arrange(desc(nbr_submissions)) %>%
  dplyr::mutate(language = factor(language, levels = language))
ggplot(lang_order, aes(x = language, y = nbr_submissions)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  labs(x = "", y = "Number of submissions") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
        axis.title = element_text(size = 15))

Association between number of citations and number of stars of the GitHub repo

## Citations vs GitHub stars, rendered interactively
stars_vs_cit <- ggplot(papers,
                       aes(x = citation_count, y = repo_nbr_stars,
                           label = title)) +
  geom_point(alpha = 0.5) +
  theme_bw() +
  scale_x_sqrt() +
  scale_y_sqrt() +
  labs(x = "Crossref citation count", y = "Number of stars, GitHub repo") +
  theme(axis.title = element_text(size = 15))
ggplotly(stars_vs_cit, tooltip = c("label", "x", "y"))

Distribution of time between GitHub repo creation and JOSS submission

## Histogram of days between repo creation and pre-review opening
ggplot(papers, aes(x = as.numeric(prerev_opened - repo_created))) +
  geom_histogram(bins = 50) +
  theme_bw() +
  theme(axis.title = element_text(size = 15)) +
  labs(x = "Time (days) from repo creation to JOSS pre-review start")

Distribution of time between JOSS acceptance and last commit

## Histogram of days between review closure and last repo push,
## faceted by publication year
ggplot(papers, aes(x = as.numeric(repo_pushed - review_closed))) +
  geom_histogram(bins = 50) +
  theme_bw() +
  facet_wrap(~ year(published.date), scales = "free_y") +
  theme(axis.title = element_text(size = 15)) +
  labs(x = "Time (days) from closure of JOSS review to most recent commit in repo")

Number of authors per paper

List the papers with the largest number of authors, and display the distribution of the number of authors per paper, for papers with at most 20 authors.

## Papers with largest number of authors
papers %>%
  dplyr::select(title, published.date, url, nbr_authors) %>%
  dplyr::arrange(desc(nbr_authors)) %>%
  as.data.frame() %>%
  head(10)
##                                                                                                  title
## 1                                                            SunPy: A Python package for Solar Physics
## 2                                ENZO: An Adaptive Mesh Refinement Code for Astrophysics (Version 2.6)
## 3                                                               PyBIDS: Python tools for BIDS datasets
## 4                                                    Chaste: Cancer, Heart and Soft Tissue Environment
## 5                                                   spam: Software for Practical Analysis of Materials
## 6                                                                VIVO: a system for research discovery
## 7                                                                             Welcome to the Tidyverse
## 8                 Pyglmnet: Python implementation of elastic-net regularized generalized linear models
## 9                                     AMReX: a framework for block-structured adaptive mesh refinement
## 10 MNE-BIDS: Organizing electrophysiological data into the BIDS format and facilitating their analysis
##    published.date                                   url nbr_authors
## 1      2020-02-14 http://dx.doi.org/10.21105/joss.01832         124
## 2      2019-10-03 http://dx.doi.org/10.21105/joss.01636          55
## 3      2019-08-12 http://dx.doi.org/10.21105/joss.01294          31
## 4      2020-03-13 http://dx.doi.org/10.21105/joss.01848          29
## 5      2020-07-13 http://dx.doi.org/10.21105/joss.02286          27
## 6      2019-07-26 http://dx.doi.org/10.21105/joss.01182          25
## 7      2019-11-21 http://dx.doi.org/10.21105/joss.01686          24
## 8      2020-03-01 http://dx.doi.org/10.21105/joss.01959          22
## 9      2019-05-12 http://dx.doi.org/10.21105/joss.01370          17
## 10     2019-12-18 http://dx.doi.org/10.21105/joss.01896          16
## Histogram of the number of authors (papers with <= 20 authors),
## one bin per possible author count
authors_capped <- papers %>% dplyr::filter(nbr_authors <= 20)
nbins <- max(authors_capped$nbr_authors)
ggplot(authors_capped, aes(x = nbr_authors)) +
  geom_histogram(bins = nbins, fill = "lightgrey", color = "grey50") +
  theme_bw() +
  facet_wrap(~ year(published.date), scales = "free_y") +
  theme(axis.title = element_text(size = 15)) +
  labs(x = "Number of authors",
       y = "Number of publications with\na given number of authors")

## Fraction of submissions per author-count bin, by year. group_by with
## .drop = FALSE keeps empty year/bin combinations; after summarize the
## result stays grouped by year, so freq is computed within each year.
authors_by_year <- papers %>%
  dplyr::mutate(
    nbr_authors = factor(
      replace(nbr_authors, nbr_authors > 5, ">5"),
      levels = c("1", "2", "3", "4", "5", ">5")
    ),
    year = factor(year(published.date))
  ) %>%
  dplyr::group_by(year, nbr_authors, .drop = FALSE) %>%
  dplyr::summarize(n = n()) %>%
  dplyr::mutate(freq = n / sum(n)) %>%
  dplyr::mutate(year = as.integer(as.character(year)))
ggplot(authors_by_year, aes(x = year, y = freq, fill = nbr_authors)) +
  geom_area() +
  theme_minimal() +
  scale_fill_brewer(palette = "Set1", name = "Number of\nauthors", 
                    na.value = "grey") +
  theme(axis.title = element_text(size = 15)) +
  labs(x = "Year", y = "Fraction of submissions")

Number of authors vs number of contributors to the GitHub repo

Note that points are slightly jittered to reduce the overlap.

## Authors vs repo contributors (>= 2 commits); the identity line marks
## equal counts and points are jittered slightly to reduce overlap
authors_vs_contribs <- ggplot(papers,
                              aes(x = nbr_authors,
                                  y = repo_nbr_contribs_2ormore,
                                  label = title)) +
  geom_abline(slope = 1, intercept = 0) +
  geom_jitter(width = 0.05, height = 0.05, alpha = 0.5) +
  theme_bw() +
  scale_x_sqrt() +
  scale_y_sqrt() +
  labs(x = "Number of authors",
       y = "Number of contributors\nwith at least 2 commits") +
  theme(axis.title = element_text(size = 15))
plotly::ggplotly(authors_vs_contribs, tooltip = c("label", "x", "y"))

Distribution of software repo licenses

## Bar chart of software licenses, faceted by publication year
ggplot(papers, aes(x = repo_license)) +
  geom_bar() +
  theme_bw() +
  facet_wrap(~ year(published.date), scales = "free_y") +
  labs(x = "Software license", y = "Number of submissions") +
  theme(axis.title = element_text(size = 15),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

## For plots below, replace licenses present in less 
## than 2.5% of the submissions by 'other'
tbl <- table(papers$repo_license)
to_replace <- names(tbl[tbl <= 0.025 * nrow(papers)])
## Number of submissions per license and year (empty combinations kept
## via .drop = FALSE; tally() counts within the existing groups)
license_counts <- papers %>%
  dplyr::mutate(
    year = factor(year(published.date)),
    repo_license = factor(replace(repo_license,
                                  repo_license %in% to_replace,
                                  "other"))
  ) %>%
  dplyr::group_by(year, repo_license, .drop = FALSE) %>%
  dplyr::tally() %>%
  dplyr::mutate(year = as.integer(as.character(year)))
ggplot(license_counts, aes(x = year, y = n, fill = repo_license)) +
  geom_area() +
  theme_minimal() +
  scale_fill_brewer(palette = "Set1", name = "Software\nlicense", 
                    na.value = "grey") +
  theme(axis.title = element_text(size = 15)) +
  labs(x = "Year", y = "Number of submissions")

## Fraction of submissions per license and year. After summarize the
## result stays grouped by year, so freq is within-year.
license_freqs <- papers %>%
  dplyr::mutate(
    year = factor(year(published.date)),
    repo_license = factor(replace(repo_license,
                                  repo_license %in% to_replace,
                                  "other"))
  ) %>%
  dplyr::group_by(year, repo_license, .drop = FALSE) %>%
  dplyr::summarize(n = n()) %>%
  dplyr::mutate(freq = n / sum(n)) %>%
  dplyr::mutate(year = as.integer(as.character(year)))
ggplot(license_freqs, aes(x = year, y = freq, fill = repo_license)) +
  geom_area() +
  theme_minimal() +
  scale_fill_brewer(palette = "Set1", name = "Software\nlicense", 
                    na.value = "grey") +
  theme(axis.title = element_text(size = 15)) +
  labs(x = "Year", y = "Fraction of submissions")

Save object

The tibble object with all data collected above is serialized to a file that can be downloaded and reused. To read the current version of this file directly from GitHub, use the following code:

## Read the serialized tibble straight from the gh-pages branch;
## gzcon() decompresses the RDS stream from the URL connection
rds_url <- "https://github.com/openjournals/joss-analytics/blob/gh-pages/joss_submission_analytics.rds?raw=true"
papers <- readRDS(gzcon(url(rds_url)))
head(papers) %>% as.data.frame()
##        alternative.id                     container.title    created  deposited
## 1 10.21105/joss.00900     Journal of Open Source Software 2018-09-23 2018-09-23
## 2 10.21105/joss.01423     Journal of Open Source Software 2019-05-08 2019-11-17
## 3 10.21105/joss.00011 The Journal of Open Source Software 2016-05-12 2017-10-24
## 4 10.21105/joss.01614     Journal of Open Source Software 2019-08-20 2019-11-17
## 5 10.21105/joss.00782     Journal of Open Source Software 2018-06-29 2018-06-29
## 6 10.21105/joss.00082 The Journal of Open Source Software 2016-10-27 2019-09-15
##   published.print                 doi    indexed      issn issue     issued
## 1      2018-09-23 10.21105/joss.00900 2020-03-10 2475-9066    29 2018-09-23
## 2      2019-05-08 10.21105/joss.01423 2020-04-07 2475-9066    37 2019-05-08
## 3      2016-05-11 10.21105/joss.00011 2020-02-21 2475-9066     1 2016-05-11
## 4      2019-08-20 10.21105/joss.01614 2020-02-14 2475-9066    40 2019-08-20
## 5      2018-06-29 10.21105/joss.00782 2020-03-10 2475-9066    26 2018-06-29
## 6      2016-10-27 10.21105/joss.00082 2020-01-30 2475-9066     6 2016-10-27
##   member page   prefix        publisher reference.count score   source
## 1   8722  900 10.21105 The Open Journal               9     1 Crossref
## 2   8722 1423 10.21105 The Open Journal               9     1 Crossref
## 3   8722   11 10.21105 The Open Journal               1     1 Crossref
## 4   8722 1614 10.21105 The Open Journal               7     1 Crossref
## 5   8722  782 10.21105 The Open Journal               5     1 Crossref
## 6   8722   82 10.21105 The Open Journal               5     1 Crossref
##                                                                                                                    title
## 1                                                                              GB code: A grain boundary generation code
## 2                                                                            CRED: a rapid peak caller for Chem-seq data
## 3                                                                              carl: a likelihood-free inference toolbox
## 4                                                                  drms: A Python package for accessing HMI and AIA data
## 5             py-opc: operate the Alphasense OPC-N2 from a raspberry pi or other popular microcontrollers/microcomputers
## 6 Habfuzz: A tool to calculate the instream hydraulic habitat suitability using fuzzy logic and fuzzy Bayesian inference
##              type                                   url volume
## 1 journal-article http://dx.doi.org/10.21105/joss.00900      3
## 2 journal-article http://dx.doi.org/10.21105/joss.01423      4
## 3 journal-article http://dx.doi.org/10.21105/joss.00011      1
## 4 journal-article http://dx.doi.org/10.21105/joss.01614      4
## 5 journal-article http://dx.doi.org/10.21105/joss.00782      3
## 6 journal-article http://dx.doi.org/10.21105/joss.00082      1
##                                                                                                                                                                                                                                                                                                                                                                        author
## 1                                                                                                                                                                                                  http://orcid.org/0000-0002-9616-4602, http://orcid.org/0000-0003-4281-5665, NA, FALSE, FALSE, NA, R., B., J., Hadian, Grabowski, Neugebauer, first, additional, additional
## 2                                                                                          http://orcid.org/0000-0002-8086-3185, http://orcid.org/0000-0003-2358-7919, http://orcid.org/0000-0002-0916-7339, http://orcid.org/0000-0002-3992-5399, FALSE, FALSE, FALSE, FALSE, Jason, Tony, Paul, Hiroki, Lin, Kuo, Horton, Nagase, first, additional, additional, additional
## 3                                                                                                                                                            http://orcid.org/0000-0002-2082-3106, http://orcid.org/0000-0002-5769-7094, http://orcid.org/0000-0002-7205-0053, FALSE, FALSE, FALSE, Gilles, Kyle, Juan, Louppe, Cranmer, Pavez, first, additional, additional
## 4 http://orcid.org/0000-0002-1361-5712, http://orcid.org/0000-0002-5662-9604, http://orcid.org/0000-0001-6915-4583, http://orcid.org/0000-0002-0361-6463, http://orcid.org/0000-0003-4217-4642, FALSE, FALSE, FALSE, FALSE, FALSE, Kolja, Monica, Nitin, Arthur, Stuart, Glogowski, Bobra, Choudhary, Amezcua, Mumford, first, additional, additional, additional, additional
## 5                                                                                                                                                                                                                                  http://orcid.org/0000-0001-5111-4671, NA, NA, FALSE, NA, NA, David, Andrew, Jakub, H Hagan, Tolmie, Trochim, first, additional, additional
## 6                                                                                                                                                                                                             http://orcid.org/0000-0002-5395-0347, NA, NA, FALSE, NA, NA, Christos, Nikolaos, Anastasios, Theodoropoulos, Skoulikidis, Stamou, first, additional, additional
##   citation_count
## 1              1
## 2              1
## 3              3
## 4              2
## 5              1
## 6              7
##                                                                                                                api_title
## 1                                                                              GB code: A grain boundary generation code
## 2                                                                            CRED: a rapid peak caller for Chem-seq data
## 3                                                                              carl: a likelihood-free inference toolbox
## 4                                                                  drms: A Python package for accessing HMI and AIA data
## 5             py-opc: operate the Alphasense OPC-N2 from a raspberry pi or other popular microcontrollers/microcomputers
## 6 Habfuzz: A tool to calculate the instream hydraulic habitat suitability using fuzzy logic and fuzzy Bayesian inference
##   api_state                              repo_url review_issue_id
## 1  accepted  https://github.com/oekosheri/GB_code             900
## 2  accepted      https://github.com/jlincbio/cred            1423
## 3  accepted     https://github.com/diana-hep/carl              11
## 4  accepted         https://github.com/sunpy/drms            1614
## 5  accepted     https://github.com/dhhagan/py-opc             782
## 6  accepted https://github.com/chtheodoro/Habfuzz              82
##   prereview_issue_id                        languages
## 1                853                       Python,TeX
## 2               1374              Makefile,Perl,C,TeX
## 3                 NA            Python,Mako,Shell,TeX
## 4               1559                       Python,TeX
## 5                540                       Python,TeX
## 6                 77 Fortran,Shell,Batchfile,HTML,TeX
##                                archive_doi
## 1   https://doi.org/10.5281/zenodo.1433530
## 2   https://doi.org/10.5281/zenodo.2667613
## 3   http://dx.doi.org/10.5281/zenodo.47798
## 4   https://doi.org/10.5281/zenodo.3369966
## 5 http://dx.doi.org/10.5281/zenodo.1299205
## 6 https://dx.doi.org/10.5281/zenodo.163291
##                                                                                                 review_title
## 1                                                                  GB_code: A grain boundary generation code
## 2                                                                CRED: a rapid peak caller for Chem-seq data
## 3                                                                  carl: a likelihood-free inference toolbox
## 4                                                      drms: A Python package for accessing HMI and AIA data
## 5 py-opc: operate the Alphasense OPC-N2 from a raspberry pi or other popular microcontrollers/microcomputers
## 6       Habfuzz: A Fortran tool to calculate the instream hydraulic habitat suitability based on fuzzy logic
##   review_number review_state review_opened review_closed review_ncomments
## 1           900       closed    2018-08-17    2018-09-23               90
## 2          1423       closed    2019-05-01    2019-05-08               93
## 3            11       closed    2016-05-04    2016-05-11               36
## 4          1614       closed    2019-08-01    2019-08-20               62
## 5           782       closed    2018-06-18    2018-06-29               63
## 6            82       closed    2016-09-27    2016-10-27               41
##                         review_labels
## 1 accepted,published,recommend-accept
## 2 accepted,published,recommend-accept
## 3 accepted,published,recommend-accept
## 4 accepted,published,recommend-accept
## 5 accepted,published,recommend-accept
## 6 accepted,published,recommend-accept
##                                                                                                 prerev_title
## 1                                                                  GB_code: A grain boundary generation code
## 2                                                                CRED: a rapid peak caller for Chem-seq data
## 3                                                                                                       <NA>
## 4                                                      drms: A Python package for accessing HMI and AIA data
## 5 py-opc: operate the Alphasense OPC-N2 from a raspberry pi or other popular microcontrollers/microcomputers
## 6       Habfuzz: A Fortran tool to calculate the instream hydraulic habitat suitability based on fuzzy logic
##   prerev_state prerev_opened prerev_closed prerev_ncomments   prerev_labels
## 1       closed    2018-07-26    2018-08-17               42      Python,TeX
## 2       closed    2019-04-14    2019-05-01               30 C,Makefile,Perl
## 3         <NA>          <NA>          <NA>               NA            <NA>
## 4       closed    2019-07-11    2019-08-01               29          Python
## 5       closed    2018-01-19    2018-06-18               34      Python,TeX
## 6       closed    2016-09-21    2016-09-27               16                
##   days_in_pre days_in_rev to_review repo_created repo_updated repo_pushed
## 1     22 days     37 days      TRUE   2018-07-12   2020-07-06  2019-08-08
## 2     17 days      7 days      TRUE   2019-04-10   2020-02-26  2020-02-26
## 3     NA days      7 days      TRUE   2015-11-23   2020-07-22  2017-05-04
## 4     21 days     19 days      TRUE   2016-05-12   2020-04-16  2020-04-16
## 5    150 days     11 days      TRUE   2015-02-15   2020-06-04  2019-08-16
## 6      6 days     30 days      TRUE   2016-09-20   2020-02-06  2020-02-06
##   repo_nbr_stars repo_language repo_license repo_nbr_contribs
## 1             11        Python          mit                 2
## 2              1             C      gpl-3.0                 2
## 3             55        Python bsd-3-clause                 5
## 4             11        Python          mit                 7
## 5             21        Python          mit                 7
## 6              0       Fortran   apache-2.0                 2
##   repo_nbr_contribs_2ormore published.date halfyear nbr_authors
## 1                         2     2018-09-23   2018H2           3
## 2                         1     2019-05-08   2019H1           4
## 3                         4     2016-05-11   2016H1           3
## 4                         5     2019-08-20   2019H2           5
## 5                         2     2018-06-29   2018H1           3
## 6                         1     2016-10-27   2016H2           3
## Persist the fully assembled papers table (Crossref + Whedon + GitHub
## metadata) so it can be downloaded and reused in downstream analyses
## without re-querying the APIs. Restore with readRDS().
saveRDS(object = papers, file = "joss_submission_analytics.rds")

Session info

## Record the R version, platform and package versions used for this run,
## for reproducibility of the report.
utils::sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] plotly_4.9.2.1  DT_0.15         jsonlite_1.7.0  purrr_0.3.4    
##  [5] gh_1.1.0        lubridate_1.7.9 ggplot2_3.3.2   tidyr_1.1.1    
##  [9] dplyr_1.0.2     rcrossref_1.0.0 tibble_3.0.3   
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.5         lattice_0.20-41    assertthat_0.2.1   digest_0.6.25     
##  [5] mime_0.9           R6_2.4.1           plyr_1.8.6         evaluate_0.14     
##  [9] httr_1.4.2         pillar_1.4.6       rlang_0.4.7        lazyeval_0.2.2    
## [13] curl_4.3           rstudioapi_0.11    data.table_1.13.0  miniUI_0.1.1.1    
## [17] Matrix_1.2-18      rmarkdown_2.3      splines_4.0.2      urltools_1.7.3    
## [21] labeling_0.3       stringr_1.4.0      htmlwidgets_1.5.1  triebeard_0.3.0   
## [25] munsell_0.5.0      shiny_1.5.0        compiler_4.0.2     httpuv_1.5.4      
## [29] xfun_0.16          pkgconfig_2.0.3    mgcv_1.8-31        htmltools_0.5.0   
## [33] tidyselect_1.1.0   httpcode_0.3.0     fansi_0.4.1        viridisLite_0.3.0 
## [37] crayon_1.3.4       withr_2.2.0        later_1.1.0.1      crul_1.0.0        
## [41] grid_4.0.2         nlme_3.1-148       xtable_1.8-4       gtable_0.3.0      
## [45] lifecycle_0.2.0    magrittr_1.5       scales_1.1.1       bibtex_0.4.2.2    
## [49] cli_2.0.2          stringi_1.4.6      farver_2.0.3       promises_1.1.1    
## [53] xml2_1.3.2         ellipsis_0.3.1     generics_0.0.2     vctrs_0.3.2       
## [57] RColorBrewer_1.1-2 tools_4.0.2        glue_1.4.1         crosstalk_1.1.0.1 
## [61] fastmap_1.0.1      yaml_2.2.1         colorspace_1.4-1   knitr_1.29